In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import shap
import optuna
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import warnings
# Suppress every warning raised from this point on so cell output stays uncluttered.
# NOTE(review): this is a blanket filter, so deprecation and convergence warnings
# from the imported libraries are hidden as well.
warnings.filterwarnings('ignore')
Preprocessing¶
In [2]:
# Import data
# Reads creditcard.csv from the current working directory into the DataFrame
# `data`, which every later cell in this notebook works on.
data = pd.read_csv("creditcard.csv")
In [3]:
# Examine the data
# Show the first five rows to check column names and value formats.
data.head()
Out[3]:
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
| 1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
| 2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
| 3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
| 4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
In [4]:
# Info about the dataset
# Prints the row count, per-column non-null counts, dtypes and memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 284807 entries, 0 to 284806 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Time 284807 non-null float64 1 V1 284807 non-null float64 2 V2 284807 non-null float64 3 V3 284807 non-null float64 4 V4 284807 non-null float64 5 V5 284807 non-null float64 6 V6 284807 non-null float64 7 V7 284807 non-null float64 8 V8 284807 non-null float64 9 V9 284807 non-null float64 10 V10 284807 non-null float64 11 V11 284807 non-null float64 12 V12 284807 non-null float64 13 V13 284807 non-null float64 14 V14 284807 non-null float64 15 V15 284807 non-null float64 16 V16 284807 non-null float64 17 V17 284807 non-null float64 18 V18 284807 non-null float64 19 V19 284807 non-null float64 20 V20 284807 non-null float64 21 V21 284807 non-null float64 22 V22 284807 non-null float64 23 V23 284807 non-null float64 24 V24 284807 non-null float64 25 V25 284807 non-null float64 26 V26 284807 non-null float64 27 V27 284807 non-null float64 28 V28 284807 non-null float64 29 Amount 284807 non-null float64 30 Class 284807 non-null int64 dtypes: float64(30), int64(1) memory usage: 67.4 MB
In [5]:
# Check for missing values
# Counts NaN entries per column; a column of zeros means no imputation is needed.
data.isna().sum()
Out[5]:
Time 0 V1 0 V2 0 V3 0 V4 0 V5 0 V6 0 V7 0 V8 0 V9 0 V10 0 V11 0 V12 0 V13 0 V14 0 V15 0 V16 0 V17 0 V18 0 V19 0 V20 0 V21 0 V22 0 V23 0 V24 0 V25 0 V26 0 V27 0 V28 0 Amount 0 Class 0 dtype: int64
In [6]:
# Descriptive statistics
# Transposed so each feature is a row and each statistic
# (count, mean, std, min, quartiles, max) is a column.
data.describe().T
Out[6]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Time | 284807.0 | 9.481386e+04 | 47488.145955 | 0.000000 | 54201.500000 | 84692.000000 | 139320.500000 | 172792.000000 |
| V1 | 284807.0 | 1.168375e-15 | 1.958696 | -56.407510 | -0.920373 | 0.018109 | 1.315642 | 2.454930 |
| V2 | 284807.0 | 3.416908e-16 | 1.651309 | -72.715728 | -0.598550 | 0.065486 | 0.803724 | 22.057729 |
| V3 | 284807.0 | -1.379537e-15 | 1.516255 | -48.325589 | -0.890365 | 0.179846 | 1.027196 | 9.382558 |
| V4 | 284807.0 | 2.074095e-15 | 1.415869 | -5.683171 | -0.848640 | -0.019847 | 0.743341 | 16.875344 |
| V5 | 284807.0 | 9.604066e-16 | 1.380247 | -113.743307 | -0.691597 | -0.054336 | 0.611926 | 34.801666 |
| V6 | 284807.0 | 1.487313e-15 | 1.332271 | -26.160506 | -0.768296 | -0.274187 | 0.398565 | 73.301626 |
| V7 | 284807.0 | -5.556467e-16 | 1.237094 | -43.557242 | -0.554076 | 0.040103 | 0.570436 | 120.589494 |
| V8 | 284807.0 | 1.213481e-16 | 1.194353 | -73.216718 | -0.208630 | 0.022358 | 0.327346 | 20.007208 |
| V9 | 284807.0 | -2.406331e-15 | 1.098632 | -13.434066 | -0.643098 | -0.051429 | 0.597139 | 15.594995 |
| V10 | 284807.0 | 2.239053e-15 | 1.088850 | -24.588262 | -0.535426 | -0.092917 | 0.453923 | 23.745136 |
| V11 | 284807.0 | 1.673327e-15 | 1.020713 | -4.797473 | -0.762494 | -0.032757 | 0.739593 | 12.018913 |
| V12 | 284807.0 | -1.247012e-15 | 0.999201 | -18.683715 | -0.405571 | 0.140033 | 0.618238 | 7.848392 |
| V13 | 284807.0 | 8.190001e-16 | 0.995274 | -5.791881 | -0.648539 | -0.013568 | 0.662505 | 7.126883 |
| V14 | 284807.0 | 1.207294e-15 | 0.958596 | -19.214325 | -0.425574 | 0.050601 | 0.493150 | 10.526766 |
| V15 | 284807.0 | 4.887456e-15 | 0.915316 | -4.498945 | -0.582884 | 0.048072 | 0.648821 | 8.877742 |
| V16 | 284807.0 | 1.437716e-15 | 0.876253 | -14.129855 | -0.468037 | 0.066413 | 0.523296 | 17.315112 |
| V17 | 284807.0 | -3.772171e-16 | 0.849337 | -25.162799 | -0.483748 | -0.065676 | 0.399675 | 9.253526 |
| V18 | 284807.0 | 9.564149e-16 | 0.838176 | -9.498746 | -0.498850 | -0.003636 | 0.500807 | 5.041069 |
| V19 | 284807.0 | 1.039917e-15 | 0.814041 | -7.213527 | -0.456299 | 0.003735 | 0.458949 | 5.591971 |
| V20 | 284807.0 | 6.406204e-16 | 0.770925 | -54.497720 | -0.211721 | -0.062481 | 0.133041 | 39.420904 |
| V21 | 284807.0 | 1.654067e-16 | 0.734524 | -34.830382 | -0.228395 | -0.029450 | 0.186377 | 27.202839 |
| V22 | 284807.0 | -3.568593e-16 | 0.725702 | -10.933144 | -0.542350 | 0.006782 | 0.528554 | 10.503090 |
| V23 | 284807.0 | 2.578648e-16 | 0.624460 | -44.807735 | -0.161846 | -0.011193 | 0.147642 | 22.528412 |
| V24 | 284807.0 | 4.473266e-15 | 0.605647 | -2.836627 | -0.354586 | 0.040976 | 0.439527 | 4.584549 |
| V25 | 284807.0 | 5.340915e-16 | 0.521278 | -10.295397 | -0.317145 | 0.016594 | 0.350716 | 7.519589 |
| V26 | 284807.0 | 1.683437e-15 | 0.482227 | -2.604551 | -0.326984 | -0.052139 | 0.240952 | 3.517346 |
| V27 | 284807.0 | -3.660091e-16 | 0.403632 | -22.565679 | -0.070840 | 0.001342 | 0.091045 | 31.612198 |
| V28 | 284807.0 | -1.227390e-16 | 0.330083 | -15.430084 | -0.052960 | 0.011244 | 0.078280 | 33.847808 |
| Amount | 284807.0 | 8.834962e+01 | 250.120109 | 0.000000 | 5.600000 | 22.000000 | 77.165000 | 25691.160000 |
| Class | 284807.0 | 1.727486e-03 | 0.041527 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
In [7]:
# Remove the cap on displayed rows so the full per-class table below is rendered.
# NOTE: this is a global pandas display option and stays in effect for all later cells.
pd.options.display.max_rows = None
# Summary statistics of every column computed separately for each Class value,
# transposed so (feature, statistic) pairs run down the rows.
data.groupby(['Class']).describe().transpose()
Out[7]:
| Class | 0 | 1 | |
|---|---|---|---|
| Time | count | 284315.000000 | 492.000000 |
| mean | 94838.202258 | 80746.806911 | |
| std | 47484.015786 | 47835.365138 | |
| min | 0.000000 | 406.000000 | |
| 25% | 54230.000000 | 41241.500000 | |
| 50% | 84711.000000 | 75568.500000 | |
| 75% | 139333.000000 | 128483.000000 | |
| max | 172792.000000 | 170348.000000 | |
| V1 | count | 284315.000000 | 492.000000 |
| mean | 0.008258 | -4.771948 | |
| std | 1.929814 | 6.783687 | |
| min | -56.407510 | -30.552380 | |
| 25% | -0.917544 | -6.036063 | |
| 50% | 0.020023 | -2.342497 | |
| 75% | 1.316218 | -0.419200 | |
| max | 2.454930 | 2.132386 | |
| V2 | count | 284315.000000 | 492.000000 |
| mean | -0.006271 | 3.623778 | |
| std | 1.636146 | 4.291216 | |
| min | -72.715728 | -8.402154 | |
| 25% | -0.599473 | 1.188226 | |
| 50% | 0.064070 | 2.717869 | |
| 75% | 0.800446 | 4.971257 | |
| max | 18.902453 | 22.057729 | |
| V3 | count | 284315.000000 | 492.000000 |
| mean | 0.012171 | -7.033281 | |
| std | 1.459429 | 7.110937 | |
| min | -48.325589 | -31.103685 | |
| 25% | -0.884541 | -8.643489 | |
| 50% | 0.182158 | -5.075257 | |
| 75% | 1.028372 | -2.276185 | |
| max | 9.382558 | 2.250210 | |
| V4 | count | 284315.000000 | 492.000000 |
| mean | -0.007860 | 4.542029 | |
| std | 1.399333 | 2.873318 | |
| min | -5.683171 | -1.313275 | |
| 25% | -0.850077 | 2.373050 | |
| 50% | -0.022405 | 4.177147 | |
| 75% | 0.737624 | 6.348729 | |
| max | 16.875344 | 12.114672 | |
| V5 | count | 284315.000000 | 492.000000 |
| mean | 0.005453 | -3.151225 | |
| std | 1.356952 | 5.372468 | |
| min | -113.743307 | -22.105532 | |
| 25% | -0.689398 | -4.792835 | |
| 50% | -0.053457 | -1.522962 | |
| 75% | 0.612181 | 0.214562 | |
| max | 34.801666 | 11.095089 | |
| V6 | count | 284315.000000 | 492.000000 |
| mean | 0.002419 | -1.397737 | |
| std | 1.329913 | 1.858124 | |
| min | -26.160506 | -6.406267 | |
| 25% | -0.766847 | -2.501511 | |
| 50% | -0.273123 | -1.424616 | |
| 75% | 0.399619 | -0.413216 | |
| max | 73.301626 | 6.474115 | |
| V7 | count | 284315.000000 | 492.000000 |
| mean | 0.009637 | -5.568731 | |
| std | 1.178812 | 7.206773 | |
| min | -31.764946 | -43.557242 | |
| 25% | -0.551442 | -7.965295 | |
| 50% | 0.041138 | -3.034402 | |
| 75% | 0.571019 | -0.945954 | |
| max | 120.589494 | 5.802537 | |
| V8 | count | 284315.000000 | 492.000000 |
| mean | -0.000987 | 0.570636 | |
| std | 1.161283 | 6.797831 | |
| min | -73.216718 | -41.044261 | |
| 25% | -0.208633 | -0.195336 | |
| 50% | 0.022041 | 0.621508 | |
| 75% | 0.326200 | 1.764879 | |
| max | 18.709255 | 20.007208 | |
| V9 | count | 284315.000000 | 492.000000 |
| mean | 0.004467 | -2.581123 | |
| std | 1.089372 | 2.500896 | |
| min | -6.290730 | -13.434066 | |
| 25% | -0.640412 | -3.872383 | |
| 50% | -0.049964 | -2.208768 | |
| 75% | 0.598230 | -0.787850 | |
| max | 15.594995 | 3.353525 | |
| V10 | count | 284315.000000 | 492.000000 |
| mean | 0.009824 | -5.676883 | |
| std | 1.044204 | 4.897341 | |
| min | -14.741096 | -24.588262 | |
| 25% | -0.532880 | -7.756698 | |
| 50% | -0.091872 | -4.578825 | |
| 75% | 0.455135 | -2.614184 | |
| max | 23.745136 | 4.031435 | |
| V11 | count | 284315.000000 | 492.000000 |
| mean | -0.006576 | 3.800173 | |
| std | 1.003112 | 2.678605 | |
| min | -4.797473 | -1.702228 | |
| 25% | -0.763447 | 1.973397 | |
| 50% | -0.034923 | 3.586218 | |
| 75% | 0.736362 | 5.307078 | |
| max | 10.002190 | 12.018913 | |
| V12 | count | 284315.000000 | 492.000000 |
| mean | 0.010832 | -6.259393 | |
| std | 0.945939 | 4.654458 | |
| min | -15.144988 | -18.683715 | |
| 25% | -0.402102 | -8.688177 | |
| 50% | 0.141679 | -5.502530 | |
| 75% | 0.619207 | -2.974088 | |
| max | 7.848392 | 1.375941 | |
| V13 | count | 284315.000000 | 492.000000 |
| mean | 0.000189 | -0.109334 | |
| std | 0.995067 | 1.104518 | |
| min | -5.791881 | -3.127795 | |
| 25% | -0.648067 | -0.979117 | |
| 50% | -0.013547 | -0.065566 | |
| 75% | 0.662492 | 0.672964 | |
| max | 7.126883 | 2.815440 | |
| V14 | count | 284315.000000 | 492.000000 |
| mean | 0.012064 | -6.971723 | |
| std | 0.897007 | 4.278940 | |
| min | -18.392091 | -19.214325 | |
| 25% | -0.422453 | -9.692723 | |
| 50% | 0.051947 | -6.729720 | |
| 75% | 0.494104 | -4.282821 | |
| max | 10.526766 | 3.442422 | |
| V15 | count | 284315.000000 | 492.000000 |
| mean | 0.000161 | -0.092929 | |
| std | 0.915060 | 1.049915 | |
| min | -4.391307 | -4.498945 | |
| 25% | -0.582812 | -0.643539 | |
| 50% | 0.048294 | -0.057227 | |
| 75% | 0.648842 | 0.609189 | |
| max | 8.877742 | 2.471358 | |
| V16 | count | 284315.000000 | 492.000000 |
| mean | 0.007164 | -4.139946 | |
| std | 0.844772 | 3.865035 | |
| min | -10.115560 | -14.129855 | |
| 25% | -0.465543 | -6.562915 | |
| 50% | 0.067377 | -3.549795 | |
| 75% | 0.523738 | -1.226043 | |
| max | 17.315112 | 3.139656 | |
| V17 | count | 284315.000000 | 492.000000 |
| mean | 0.011535 | -6.665836 | |
| std | 0.749457 | 6.970618 | |
| min | -17.098444 | -25.162799 | |
| 25% | -0.482644 | -11.945057 | |
| 50% | -0.064833 | -5.302949 | |
| 75% | 0.399922 | -1.341940 | |
| max | 9.253526 | 6.739384 | |
| V18 | count | 284315.000000 | 492.000000 |
| mean | 0.003887 | -2.246308 | |
| std | 0.824919 | 2.899366 | |
| min | -5.366660 | -9.498746 | |
| 25% | -0.497414 | -4.664576 | |
| 50% | -0.002787 | -1.664346 | |
| 75% | 0.501103 | 0.091772 | |
| max | 5.041069 | 3.790316 | |
| V19 | count | 284315.000000 | 492.000000 |
| mean | -0.001178 | 0.680659 | |
| std | 0.811733 | 1.539853 | |
| min | -7.213527 | -3.681904 | |
| 25% | -0.456366 | -0.299423 | |
| 50% | 0.003117 | 0.646807 | |
| 75% | 0.457499 | 1.649318 | |
| max | 5.591971 | 5.228342 | |
| V20 | count | 284315.000000 | 492.000000 |
| mean | -0.000644 | 0.372319 | |
| std | 0.769404 | 1.346635 | |
| min | -54.497720 | -4.128186 | |
| 25% | -0.211764 | -0.171760 | |
| 50% | -0.062646 | 0.284693 | |
| 75% | 0.132401 | 0.822445 | |
| max | 39.420904 | 11.059004 | |
| V21 | count | 284315.000000 | 492.000000 |
| mean | -0.001235 | 0.713588 | |
| std | 0.716743 | 3.869304 | |
| min | -34.830382 | -22.797604 | |
| 25% | -0.228509 | 0.041787 | |
| 50% | -0.029821 | 0.592146 | |
| 75% | 0.185626 | 1.244611 | |
| max | 22.614889 | 27.202839 | |
| V22 | count | 284315.000000 | 492.000000 |
| mean | -0.000024 | 0.014049 | |
| std | 0.723668 | 1.494602 | |
| min | -10.933144 | -8.887017 | |
| 25% | -0.542403 | -0.533764 | |
| 50% | 0.006736 | 0.048434 | |
| 75% | 0.528407 | 0.617474 | |
| max | 10.503090 | 8.361985 | |
| V23 | count | 284315.000000 | 492.000000 |
| mean | 0.000070 | -0.040308 | |
| std | 0.621541 | 1.579642 | |
| min | -44.807735 | -19.254328 | |
| 25% | -0.161702 | -0.342175 | |
| 50% | -0.011147 | -0.073135 | |
| 75% | 0.147522 | 0.308378 | |
| max | 22.528412 | 5.466230 | |
| V24 | count | 284315.000000 | 492.000000 |
| mean | 0.000182 | -0.105130 | |
| std | 0.605776 | 0.515577 | |
| min | -2.836627 | -2.028024 | |
| 25% | -0.354425 | -0.436809 | |
| 50% | 0.041082 | -0.060795 | |
| 75% | 0.439869 | 0.285328 | |
| max | 4.584549 | 1.091435 | |
| V25 | count | 284315.000000 | 492.000000 |
| mean | -0.000072 | 0.041449 | |
| std | 0.520673 | 0.797205 | |
| min | -10.295397 | -4.781606 | |
| 25% | -0.317145 | -0.314348 | |
| 50% | 0.016417 | 0.088371 | |
| 75% | 0.350594 | 0.456515 | |
| max | 7.519589 | 2.208209 | |
| V26 | count | 284315.000000 | 492.000000 |
| mean | -0.000089 | 0.051648 | |
| std | 0.482241 | 0.471679 | |
| min | -2.604551 | -1.152671 | |
| 25% | -0.327074 | -0.259416 | |
| 50% | -0.052227 | 0.004321 | |
| 75% | 0.240671 | 0.396733 | |
| max | 3.517346 | 2.745261 | |
| V27 | count | 284315.000000 | 492.000000 |
| mean | -0.000295 | 0.170575 | |
| std | 0.399847 | 1.376766 | |
| min | -22.565679 | -7.263482 | |
| 25% | -0.070852 | -0.020025 | |
| 50% | 0.001230 | 0.394926 | |
| 75% | 0.090573 | 0.826029 | |
| max | 31.612198 | 3.052358 | |
| V28 | count | 284315.000000 | 492.000000 |
| mean | -0.000131 | 0.075667 | |
| std | 0.329570 | 0.547291 | |
| min | -15.430084 | -1.869290 | |
| 25% | -0.052950 | -0.108868 | |
| 50% | 0.011199 | 0.146344 | |
| 75% | 0.077962 | 0.381152 | |
| max | 33.847808 | 1.779364 | |
| Amount | count | 284315.000000 | 492.000000 |
| mean | 88.291022 | 122.211321 | |
| std | 250.105092 | 256.683288 | |
| min | 0.000000 | 0.000000 | |
| 25% | 5.650000 | 1.000000 | |
| 50% | 22.000000 | 9.250000 | |
| 75% | 77.050000 | 105.890000 | |
| max | 25691.160000 | 2125.870000 |
Exploratory data analysis (EDA)¶
In [8]:
# Set the Seaborn theme to a whitegrid style
# Applies globally to the matplotlib/seaborn figures drawn in the cells that follow.
sns.set_theme(style="whitegrid")
In [9]:
# Histograms and boxplots for features
# For each column: print its name and skewness, then draw one figure with the
# histogram on the left and the boxplot on the right.
for feature in data.columns:
    values = data[feature]
    print(feature)
    print('Skew :', round(values.skew(), 2))
    # Explicit figure/axes handles instead of the implicit pyplot "current axes".
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(15, 4))
    values.hist(grid=False, ax=hist_ax)
    hist_ax.set_ylabel('count')
    sns.boxplot(x=values, ax=box_ax)
    plt.show()
Time Skew : -0.04
V1 Skew : -3.28
V2 Skew : -4.62
V3 Skew : -2.24
V4 Skew : 0.68
V5 Skew : -2.43
V6 Skew : 1.83
V7 Skew : 2.55
V8 Skew : -8.52
V9 Skew : 0.55
V10 Skew : 1.19
V11 Skew : 0.36
V12 Skew : -2.28
V13 Skew : 0.07
V14 Skew : -2.0
V15 Skew : -0.31
V16 Skew : -1.1
V17 Skew : -3.84
V18 Skew : -0.26
V19 Skew : 0.11
V20 Skew : -2.04
V21 Skew : 3.59
V22 Skew : -0.21
V23 Skew : -5.88
V24 Skew : -0.55
V25 Skew : -0.42
V26 Skew : 0.58
V27 Skew : -1.17
V28 Skew : 11.19
Amount Skew : 16.98
Class Skew : 24.0
In [10]:
# Distribution of each feature, split by Class
# One violin plot per column with Class on the x-axis. The Class column itself
# is skipped: plotting the label against itself gives a degenerate figure that
# carries no information.
for col in data.columns.drop('Class'):
    sns.violinplot(x='Class', y=col, data=data)
    # Title each figure so it can be identified when skimming the output.
    plt.title(f'{col} by Class')
    plt.show()
In [11]:
# Undersampling for visualisations
# Keep every Class == 1 (fraud) row and pair it with an equally sized random
# subset of the Class == 0 rows, so both classes have the same weight in data_vis.
fraud_df = data.loc[data['Class'] == 1]
non_fraud_pool = data.loc[data['Class'] == 0]
# The sample size is derived from the data rather than hard-coded, and capped at
# the pool size so sampling without replacement cannot fail. The fixed seed makes
# a re-run select the same rows, so the figures built from data_vis are reproducible.
non_fraud_df = non_fraud_pool.sample(
    n=min(len(fraud_df), len(non_fraud_pool)),
    random_state=42,
)
data_vis = pd.concat([fraud_df, non_fraud_df])
In [12]:
# Plot pairplot between numeric columns
# The font scale has to be set BEFORE the figure is drawn to affect it. Passing
# it together with the style keeps the whitegrid theme; a bare
# sns.set(font_scale=...) would reset the style to seaborn's default.
sns.set_theme(style="whitegrid", font_scale=0.8)
# Pairwise scatter plots of the balanced sample, coloured by Class.
sns.pairplot(data_vis, height=3, hue="Class", palette='bright')
# Render explicitly so the cell does not end on the PairGrid object's repr.
plt.show()